########## PYTHON EXAMPLES #########
# NOTE: The code was executed using Python version 3.11.40.
# Using a different Python version may require upgrading or downgrading specific libraries, depending on library dependencies.
# Libraries can be installed using the command: pip install <library-name>
# Example: pip install pandas  -> installs the pandas library, which must be done before importing a library using the import command: import pandas

# Specify working directory
import os                                       # load os library
path = '/Impact_Evaluation'                     # the path should be changed by the user
os.chdir(path)                                  # change working directory to path

# 1
import pandas as pd                             # load pandas library
import statsmodels.api as sm                    # load statsmodels library
df = pd.read_csv('data/coffeeleaflet.csv')      # load coffeeleaflet data
df = df.loc[:, ['treatment', 'awarewaste']]     # only keep columns of interest
df = df.dropna()                                # drop missing observations
D = df['treatment']                             # define treatment
D = sm.add_constant(D)                          # add a constant
Y = df['awarewaste']                            # define outcome
results = sm.OLS(Y, D).fit(cov_type = 'HC0')    # estimate impact
print(results.summary())                        # show results

# 2
import pandas as pd                            # load pandas library
import numpy as np                             # load numpy library
import statsmodels.api as sm                   # load statsmodels library
import matplotlib.pyplot as plt                # load matplotlib library
df = pd.read_csv('data/marketing.csv')         # load marketing data
Y = df['sales']                                # define outcome
D = df['newspaper']                            # define treatment
x_axis = np.arange(min(D), max(D))             # define points where we evaluate
results = sm.nonparametric.KernelReg(Y,D,'c', reg_type='lc').fit(x_axis) #kernel
plt.plot(x_axis, results[0], color='black')    # plot average sales
plt.xlabel('newspaper')                        # label x axis
plt.ylabel('sales')                            # label y axis
plt.ylim([12, 26])                             # setting y-axis range
plt.show()                                     # show the plot
plt.plot(x_axis, results[1], color='black')    # plot impact
plt.xlabel('newspaper')                        # label x axis
plt.ylabel('marginal effects')                 # label y axis      
plt.show()                                     # show the plot

# 3
import pandas as pd                            # load pandas library
import statsmodels.api as sm                   # load statsmodels library
df = pd.read_csv('data/coupon.csv')            # load coupon data
X = df.loc[:, df.columns != 'dailyspending']   # define regressors
X = sm.add_constant(X)                         # add a constant 
Y = df['dailyspending']                        # define outcome
results = sm.OLS(Y, X).fit(cov_type = 'HC0')   # run linear regression 
print(results.summary())                       # show results

# 4
from causalinference import CausalModel                       # load causalmodel method
import pandas as pd                                           # load pandas library
import numpy as np                                            # load numpy library
df = pd.read_csv('data/coupon.csv')                           # load coupon data
Y = np.asarray(df['dailyspending'])                           # select outcome
D = np.asarray(df['coupons'])                                 # select treatment
X = np.asarray(df.drop(['dailyspending', 'coupons'], axis=1)) # select covariates
model = CausalModel(Y,D,X)                                    # create causal model
model.est_via_matching(weights='inv', matches=1)              # pair matching
print(model.estimates)                                        # matching output

# 5 
import pandas as pd                                       # load pandas library
import dowhy as dw                                        # load dowhy library
df = pd.read_csv('data/coupon.csv')                       # load coupon data
X = df.drop(['dailyspending', 'coupons'], axis = 1)       # select covariates
model = dw.CausalModel(data = df,                         # create model
treatment = 'coupons',                                    # specify treatment
outcome = 'dailyspending',                                # specify outcome
common_causes = list(X.columns))                          # specify covariates
identified_estimand = model.identify_effect()             # identify the effect
estimate = model.estimate_effect(identified_estimand,     # IPW
method_name = 'backdoor.propensity_score_weighting', 
target_units = 'ate',                                
method_params = {'weighting_scheme': 'ips_weight'})  
print(estimate.value)                                     # show impact
print(estimate.test_stat_significance())                  # show p-value

# 6
import pandas as pd                               # load pandas library
import doubleml as dml                            # load doubleml library
from sklearn import linear_model                  # import models from sklearn
df = pd.read_csv('data/coupon.csv')               # load coupon data
X = df.drop(['dailyspending', 'coupons'], axis=1) # select covariates
dml_data = dml.DoubleMLData(df,                   # create object from coupon data
y_col = 'dailyspending',                          # define outcome
d_cols = 'coupons',                               # define intervention
x_cols = list(X.columns.values))                  # define covariates
ml_l = linear_model.LinearRegression()            # define model for outcome
ml_m = linear_model.LogisticRegression()          # define model for intervention
dr = dml.DoubleMLPLR(dml_data,                    # DR estimation
ml_l,                                 
ml_m).fit()                           
print(dr.summary)                                 # show the results

# 7
import pandas as pd                                      # load pandas library
import doubleml as dml                                   # load doubleml library
from sklearn import linear_model                         # load from sklearn
df = pd.read_csv('data/coupon.csv')                      # load coupon data
X = df.drop(['dailyspending', 'coupons'], axis = 1)      # define covariates
dml_data = dml.DoubleMLData(df,                          # create data 
y_col = 'dailyspending',                                 # define outcome
d_cols = 'coupons',                                      # define intervention
x_cols = list(X.columns.values))                         # define covariates
ml_l = linear_model.LassoCV()                            # learner for outcome
ml_m = linear_model.LogisticRegressionCV(penalty = 'l1', # learner for treatment
solver = 'saga',                                         # solver
max_iter = 350,                                          # max. iterations
Cs = 1,                                                  # inverse regularization
tol = 0.01)                                              # tolerance for stopping
dml_lasso = dml.DoubleMLPLR(dml_data,                    # double machine learning
ml_l,                                                    # outcome model
ml_m,                                                    # treatment model
n_folds = 3).fit()                                       # number of folds
print(dml_lasso.summary)                                 # show the results

# 8
import pandas as pd                                      # load pandas library
import matplotlib.pyplot as plt                          # load matplotlib library
from econml.dml import CausalForestDML                   # load CausalForest from econml
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier  # load base models

df = pd.read_csv('data/coupon.csv')                      # load coupon data
df.columns = df.columns.astype(str)                      # convert column names to strings

Y = df[['dailyspending']]                                # define outcome as DataFrame
D = df[['coupons']]                                      # define treatment as DataFrame
X = df.drop(['dailyspending', 'coupons'], axis=1)        # define covariates

est = CausalForestDML(criterion='het',                   # use criterion for heterogeneity
                      n_estimators=500,                  # number of trees
                      min_samples_leaf=10,               # minimum samples per leaf
                      max_depth=10,                      # maximum tree depth
                      max_samples=0.5,                   # subsample ratio
                      discrete_treatment=True,           # binary treatment
                      model_y=RandomForestRegressor(max_features='sqrt'),  # base model for outcome
                      model_t=RandomForestClassifier())  # base model for treatment

est.fit(Y=Y, T=D, X=X, W=X)                              # fit causal forest

cateX = est.effect(X)                                    # compute CATEs
lb, ub = est.effect_interval(X, alpha=0.05)              # confidence intervals via bootstrap

cates = pd.concat([pd.DataFrame(cateX),                  # combine CATEs
                   pd.DataFrame(lb),                     # lower bounds
                   pd.DataFrame(ub)], axis=1)            # upper bounds
cates.columns = ['CATE', 'Lower Bound', 'Upper Bound']   # rename columns
print(cates)                                             # print CATE table

plt.hist(cateX, color='gray', bins=16, rwidth=0.9)       # plot histogram of CATEs
plt.xlabel('Predictions')                                # set x-axis label
plt.ylabel('Frequency')                                  # set y-axis label
plt.title('Histogram of CATEs')                          # set plot title
plt.show()                                               # show plot

# 9
import pandas as pd                                 # load pandas library
import numpy as np                                  # load numpy library
import doubleml as dml                              # load doubleml library
from sklearn import ensemble                        # load ensemble from sklearn
df = pd.read_csv('data/coupon.csv')                 # load coupon data
X = df.drop(['dailyspending', 'coupons'], axis = 1) # select covariates
dml_data = dml.DoubleMLData(df,                     # create data 
y_col = 'dailyspending',                            # define outcome
d_cols = 'coupons',                                 # define intervention
x_cols = list(X.columns.values))                    # define covariates
ml_g = ensemble.RandomForestRegressor(max_features = 'sqrt') # outcome model 
ml_m = ensemble.RandomForestClassifier()            # intervention model
out = dml.DoubleMLIRM(dml_data,                     # double machine learning 
ml_g,                                               # learner for outcome
ml_m,                                               # learner for intervention
normalize_ipw = True,                               # normalize weights
trimming_rule = 'truncate',                         # trimming approach
trimming_threshold = 0.01).fit()                    # threshold for trimming
df['intercept'] = np.ones(len(df))                  # create constant 
cateX = df.loc[:, ['intercept', 'dailyspending_preperiod']] # X for heterogeneity
CATEs = out.cate(cateX)                             # estimate effect heterogeneity
print(CATEs)                                        # CATEs by past spending                                          

# 10
importances = est.feature_importances_        # importances of X in causal forest
importances = importances.ravel() if importances.ndim > 1 else importances
feature_names = X.columns                     # extract names of X
importance_df = pd.DataFrame({
    'Feature': feature_names, 
    'Importance': importances                 # use flattened importances
})
print(importance_df)                          # report importances of X

# 11
policy_tree = out.policy_tree(features = X, depth = 2) # policies for 4 subgroups
policy_tree.plot_tree()                                # tree with optimal policy

# 12
import statsmodels.api as sm                 # load statsmodels library
import linearmodels.iv.model as lm           # load linearmodels library
import pandas as pd                          # load pandas library
df = pd.read_csv('data/c401k.csv')           # load c401k data
df = sm.add_constant(data=df, prepend=False) # add constant 
ivreg = lm.IV2SLS(dependent = df['nettfa'],  # outcome variable
endog = df['p401k'],                         # intervention
exog = df['const'],                          # covariates
instruments = df['e401k'])                   # instrument 
LATE = ivreg.fit(cov_type = 'robust')        # run instrument regression
LATE.summary                                 # results

# 13
import pandas as pd                                   # load pandas library                                  
from sklearn.linear_model import LogisticRegressionCV # load LogisticRegressionCV
from econml.iv.dr import ForestDRIV                   # load ForestDRIV
import math                                           # load math library
df = pd.read_csv('data/c401k.csv')                    # load c401k data
X = df.iloc[:, 4:11]                                  # select covariates
Y = df.iloc[:, 1]                                     # select outcome
W = df.iloc[:, 2]                                     # select treatment
Z = df.iloc[:, 3]                                     # select instrument
ivforest = ForestDRIV(n_estimators = 2000,            # IV forest
                     min_samples_leaf = 5,            # specify min samples leaf
                     max_depth = min(round(math.sqrt(len(X.columns)))+20, len(X.columns)),  
                     model_t_xw = LogisticRegressionCV(max_iter = 350),                     
                     discrete_treatment = True)       # discrete intervention
ivforest.fit(Y = Y, T = W, Z = Z, X = X)              # fit IV forest with data
ATE = ivforest.ate(X)                                 # compute ATE
INF = ivforest.ate_inference(X)                       # compute p-value
print(ATE)                                            # show ATE
print(INF)                                            # show p-value

# 14
from rdrobust import rdrobust, rdplot   # load rdrobust library
import pandas as pd                     # load pandas library
df = pd.read_csv('data/indh.csv')       # load indh data
Y = df.iloc[:, 0]                       # outcome variable
X = df.iloc[:, 1]                       # running variable
results = rdrobust(y=Y, x=X, c=30)      # RDD (threshold: 30)
print(results)                          # show impact
rdplot(y=Y, x=X, c=30)                  # plot outcome

# 15
import pandas as pd                              # load pandas library
import numpy as np                               # load numpy library
import doubleml as dml                           # load doubleml library
from sklearn import linear_model                 # load linear_model 
df = pd.read_csv('data/nsw_long.csv')            # load nsw_long data
df = df.dropna()                                 # drop missings
df['time'] = np.where(df['year'] == 1975, 0, 1)  # create time variable
df['const'] = np.ones(len(df))                   # create constant term
dml_data = dml.DoubleMLData(df,                  # create data 
y_col = 're',                                    # define outcome
d_cols = 'treated',                              # define intervention  
x_cols = 'const',                                # define covariates
t_col = 'time')                                  # define time
ml_g = linear_model.LinearRegression()           # define outcome model 
ml_m = linear_model.LogisticRegression()         # define intervention model
did = dml.DoubleMLDIDCS(dml_data,                # DiD
ml_g,                                            # outcome model
ml_m).fit()                                      # intervention model
print(did)                                       # show results

# 16
x = df.loc[:, ['educ', 'nodegree', 'age', 'married']] #define covariates
dml_data = dml.DoubleMLData(df,                  # create data 
y_col = 're',                                    # define outcome
d_cols = 'treated',                              # define intervention  
x_cols = list(x.columns.values),                 # define covariates
t_col = 'time')                                  # define time
ml_g = linear_model.LinearRegression()           # define outcome model
ml_m = linear_model.LogisticRegression()         # define intervention model 
did = dml.DoubleMLDIDCS(dml_data,                # DiD
ml_g,                                            # outcome model
ml_m).fit()                                      # intervention model 
print(did)                                       # show results

# 17
import pandas as pd                              # load pandas library
import matplotlib.pyplot as plt                  # load matplotlib library
from synthdid.synthdid import Synthdid as sdid   # load synthdid library
from synthdid.get_data import california_prop99  # load california data
out = sdid(california_prop99(),                  # synthetic did on data
unit = 'State',                                  # define unit
time = 'Year',                                   # define time
treatment = 'treated',                           # define intervention
outcome = 'PacksPerCapita')                      # define outcome
out = out.fit().vcov(method = 'placebo')         # placebo se
out.summary().summary2                           # show results
out.plot_outcomes()                              # plot results
plt.show()                                       # show the plot